Load packages

The tidyverse and caret packages are loaded below.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift

Read and prepare data

The code chunk below reads in the df_all data and displays

df_all <- readr::read_csv("final_project_train.csv", col_names = TRUE)
## Rows: 677 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): region, customer, outcome
## dbl (35): rowid, xb_01, xb_02, xb_03, xn_01, xn_02, xn_03, xa_01, xa_02, xa_...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_all %>%glimpse()
## Rows: 677
## Columns: 38
## $ rowid    <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, 27,…
## $ region   <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "…
## $ customer <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ xb_01    <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429, 3…
## $ xb_02    <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13, 10…
## $ xb_03    <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4, -3…
## $ xn_01    <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8571…
## $ xn_02    <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, 6, …
## $ xn_03    <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5, -…
## $ xa_01    <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.857143, …
## $ xa_02    <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, 26,…
## $ xa_03    <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4, -5…
## $ xb_04    <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1857…
## $ xb_05    <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0.00…
## $ xb_06    <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000, 6…
## $ xb_07    <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286, 1…
## $ xb_08    <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.35483871,…
## $ xn_04    <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5607…
## $ xn_05    <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -1.0…
## $ xn_06    <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5, 2…
## $ xn_07    <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857, 1…
## $ xn_08    <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129, -0…
## $ xa_04    <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714, 2…
## $ xa_05    <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0.40…
## $ xa_06    <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.000000, …
## $ xa_07    <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429, 4…
## $ xa_08    <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3571…
## $ xw_01    <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333, 6…
## $ xw_02    <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4, 2…
## $ xw_03    <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 103, …
## $ xs_01    <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442957…
## $ xs_02    <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.12204299,…
## $ xs_03    <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6540…
## $ xs_04    <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2594…
## $ xs_05    <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.043…
## $ xs_06    <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8672…
## $ response <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279, 1…
## $ outcome  <chr> "non_event", "non_event", "event", "non_event", "non_event", …

##Added log-transformed response

log_all<-df_all %>%mutate(logresponse =log(response))
log_all%>%glimpse()
## Rows: 677
## Columns: 39
## $ rowid       <dbl> 1, 3, 4, 5, 8, 9, 11, 14, 15, 16, 17, 18, 19, 22, 24, 25, …
## $ region      <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX"…
## $ customer    <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B"…
## $ xb_01       <dbl> 4.000000, 1.000000, 2.000000, 2.520000, 2.548387, 3.071429…
## $ xb_02       <dbl> 4, 1, 2, 11, 6, 6, 10, 12, 9, 10, 8, 10, 10, 8, 6, 10, 13,…
## $ xb_03       <dbl> 4, 1, 2, -6, -1, 1, -4, -4, -2, -4, -2, -2, -2, -4, 1, -4,…
## $ xn_01       <dbl> 3.0000000, 2.0000000, 2.0000000, 1.5333333, 0.8387097, 1.8…
## $ xn_02       <dbl> 3, 2, 4, 9, 3, 8, 6, 10, 10, 4, 6, 8, 9, 5, 7, 12, 12, 6, …
## $ xn_03       <dbl> 3, 2, 0, -3, -4, -2, -5, -6, -3, -5, -3, -6, -4, -3, 0, -5…
## $ xa_01       <dbl> 12.000000, 3.000000, 9.000000, 7.080000, 6.451613, 6.85714…
## $ xa_02       <dbl> 12, 3, 9, 29, 17, 18, 24, 27, 20, 19, 15, 24, 24, 15, 14, …
## $ xa_03       <dbl> 12, 3, 9, -7, -2, 2, -9, -5, -3, -3, -1, 1, -2, -3, 3, -4,…
## $ xb_04       <dbl> 1.3333333, 1.0000000, 1.0000000, 0.8950476, 1.2247312, 1.1…
## $ xb_05       <dbl> 1.3333333, 1.0000000, 1.0000000, -2.0000000, -0.5000000, 0…
## $ xb_06       <dbl> 1.333333, 1.000000, 1.000000, 4.000000, 4.000000, 3.000000…
## $ xb_07       <dbl> 4.000000, 1.000000, 2.000000, 1.933333, 1.967742, 1.714286…
## $ xb_08       <dbl> -1.00000000, 1.00000000, 0.00000000, -0.08000000, 0.354838…
## $ xn_04       <dbl> 1.0000000, 2.0000000, 1.0000000, 0.5268889, 0.4688172, 0.5…
## $ xn_05       <dbl> 1.0000000, 2.0000000, 0.0000000, -1.0000000, -1.3333333, -…
## $ xn_06       <dbl> 1.0, 2.0, 2.0, 2.5, 3.0, 2.0, 4.0, 4.0, 3.0, 2.0, 2.0, 2.5…
## $ xn_07       <dbl> 3.000000, 2.000000, 2.500000, 1.493333, 1.225806, 1.642857…
## $ xn_08       <dbl> -1.0000000, 2.0000000, -1.0000000, -0.4400000, -0.4516129,…
## $ xa_04       <dbl> 6.000000, 3.000000, 6.750000, 2.425333, 3.023656, 2.685714…
## $ xa_05       <dbl> 6.0000000, 3.0000000, 4.5000000, -3.5000000, -0.6666667, 0…
## $ xa_06       <dbl> 6.000000, 3.000000, 9.000000, 9.000000, 13.000000, 6.00000…
## $ xa_07       <dbl> 9.000000, 3.000000, 7.500000, 4.466667, 4.612903, 4.071429…
## $ xa_08       <dbl> 3.0000000, 3.0000000, 6.0000000, 0.7066667, 1.3225806, 1.3…
## $ xw_01       <dbl> 23.00000, 17.00000, 52.50000, 64.52564, 54.75758, 58.33333…
## $ xw_02       <dbl> 23, 17, 48, 0, 12, 15, 0, 0, 0, 7, 14, 0, 0, 0, 8, 8, 0, 4…
## $ xw_03       <dbl> 23, 17, 57, 106, 105, 101, 107, 109, 109, 104, 109, 99, 10…
## $ xs_01       <dbl> 0.262073307, 0.330804757, 0.239795763, 0.142106837, 0.2442…
## $ xs_02       <dbl> 0.26207331, 0.33080476, 0.19049123, -0.73321509, -0.122042…
## $ xs_03       <dbl> 0.2620733, 0.3308048, 0.2891003, 0.5500723, 1.3134719, 0.6…
## $ xs_04       <dbl> 0.5375576, 0.4286607, 0.3676937, 0.2865445, 0.2375470, 0.2…
## $ xs_05       <dbl> 0.5375575604, 0.4286607050, 0.2485001680, 0.0000000000, 0.…
## $ xs_06       <dbl> 0.5375576, 0.4286607, 0.4868872, 0.6357541, 0.4327004, 0.8…
## $ response    <dbl> 2.617991, 1.184632, 2.216626, 2.726715, 1.483323, 2.039279…
## $ outcome     <chr> "non_event", "non_event", "event", "non_event", "non_event…
## $ logresponse <dbl> 0.9624073, 0.1694321, 0.7959862, 1.0030975, 0.3942847, 0.7…

creates a long-format dataset by gathering the input features into a single column via the pivot_longer() function.

lf_all <- log_all %>% 
  tibble::rowid_to_column("obs_id") %>% 
  pivot_longer(!c("obs_id","outcome", "customer", "region", "response", "rowid","logresponse")) %>%
  mutate(input_id = str_sub(name, 2, 5))
lf_all %>% glimpse()
## Rows: 22,341
## Columns: 10
## $ obs_id      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ rowid       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ region      <chr> "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX", "XX"…
## $ customer    <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B"…
## $ response    <dbl> 2.617991, 2.617991, 2.617991, 2.617991, 2.617991, 2.617991…
## $ outcome     <chr> "non_event", "non_event", "non_event", "non_event", "non_e…
## $ logresponse <dbl> 0.9624073, 0.9624073, 0.9624073, 0.9624073, 0.9624073, 0.9…
## $ name        <chr> "xb_01", "xb_02", "xb_03", "xn_01", "xn_02", "xn_03", "xa_…
## $ value       <dbl> 4.000000, 4.000000, 4.000000, 3.000000, 3.000000, 3.000000…
## $ input_id    <chr> "b_01", "b_02", "b_03", "n_01", "n_02", "n_03", "a_01", "a…

Part i: Exploration

1.Visualize the distributions of variables in the data set.

1a) Counts for categorical variables

lf_all%>% ggplot(mapping = aes(x=outcome))+
  geom_bar(mapping = aes(y = stat(prop),
                         group = 1)) +
  geom_text(stat = 'count',
            mapping = aes(y = after_stat( count / sum(count) ),
                          label = after_stat( signif(count / sum(count), 3) )),
            color = 'red', nudge_y = 0.01, size = 3) +
        theme_bw()

mean(df_all$outcome =="event")
## [1] 0.1875923
df_all%>% ggplot(mapping = aes(x=region))+
       geom_bar(mapping = aes(y = stat(prop),
                         group = 1)) +
       geom_text(stat = 'count',
            mapping = aes(y = after_stat( count / sum(count) ),
                          label = after_stat( signif(count / sum(count), 3) )),
            color = 'red', nudge_y = 0.01, size = 3) +
        theme_bw()

df_all%>% ggplot(mapping = aes(x=customer))+
       geom_bar(mapping = aes(y = stat(prop),
                         group = 1)) +
       geom_text(stat = 'count',
            mapping = aes(y = after_stat( count / sum(count) ),
                          label = after_stat( signif(count / sum(count), 3) )),
            color = 'red', nudge_y = 0.01, size = 3) +
        theme_bw()

##1b) Distributions for continuous variables.

###
lf_all%>%ggplot(mapping = aes(x=input_id,y = value))+
           geom_boxplot(mapping =aes(group=input_id))

lf_all%>% ggplot(mapping = aes(x = value))+
            geom_freqpoly(mapping = aes(group=input_id),bins=21 )+
            facet_wrap(~input_id, scales = 'free_y')

lf_all%>% ggplot(mapping = aes(x=value))+
        geom_histogram(bin=5)+
        facet_wrap(~name,scales = 'free')+
        theme_bw()
## Warning: Ignoring unknown parameters: bin
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

From the histgram we can see that most of the the distributions are Gaussian like. But some are not, like xw_02,xw_03.

2.Consider conditioning (grouping or “breaking up”) the continuous variables based on the categorical variables.

2a)Based on region

lf_all%>%ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, region),
                             fill = region,
                             color = region),
               alpha = 0.25) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

lf_all%>% ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, region),
                             fill = region,
                             color = region),
               alpha = 0.1) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, region),
                             color = region),
               position = position_dodge(0.75)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

lf_all%>% ggplot(mapping = aes(x = as.factor(input_id), y = value)) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, region),
                             color = region)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

2b)Based on customer

lf_all%>%ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, customer),
                             fill = customer,
                             color = customer),
               alpha = 0.25) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")
## Warning: This manual palette can handle a maximum of 8 values. You have supplied
## 9.

## Warning: This manual palette can handle a maximum of 8 values. You have supplied
## 9.

lf_all%>%ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, customer),
                             fill = customer,
                             color = customer),
               alpha = 0.1) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, customer),
                             color = customer),
               position = position_dodge(0.75)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")
## Warning: This manual palette can handle a maximum of 8 values. You have supplied
## 9.

## Warning: This manual palette can handle a maximum of 8 values. You have supplied
## 9.

lf_all%>%ggplot(mapping = aes(x = as.factor(input_id), y = value)) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, customer),
                             color = customer)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")
## Warning: This manual palette can handle a maximum of 8 values. You have supplied
## 9.

Based on the categorical variables (region and customers), there are no significant difference in continuous variable distribution and their summary statics for Bing lexicon (xb), NRC lexicon (xn), AFINN (xa) and sedimentr derived features (xs). But it can be seen that word count derived features (xw) are different across region and customers.

2c) Based on the binary outcome

lf_all%>%ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, outcome),
                             fill = outcome,
                             color = outcome),
               alpha = 0.25) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

lf_all%>%ggplot(mapping = aes(x = input_id, y = value)) +
  geom_boxplot(mapping = aes(group = interaction(input_id, outcome),
                             fill = outcome,
                             color = outcome),
               alpha = 0.1) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, outcome),
                             color = outcome),
               position = position_dodge(0.75)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

lf_all%>%ggplot(mapping = aes(x = as.factor(input_id), y = value)) +
  stat_summary(fun.data = 'mean_se',
               fun.args = list(mult = 2),
               mapping = aes(group = interaction(input_id, outcome),
                             color = outcome)) +
  ggthemes::scale_color_colorblind() +
  ggthemes::scale_fill_colorblind() +
  theme_bw() +
  theme(legend.position = "top")

Based on the binary outcome, there are no significant difference in continuous variable distribution and their summary statics for Bing lexicon (xb), NRC lexicon (xn), AFINN (xa) and sedimentr derived features (xs). But word count derived features (xw) showed the different outcomes.

###3. Visualize the relationships between the continuous inputs?

df_all %>% 
  select(starts_with('x')) %>% 
  cor() %>% 
  corrplot::corrplot()

Most of the inputs are not correlated.

4.Visualize the relationships between the continuous outputs (response and thelog-transformed response) with respect to the continuous inputs.

df_all %>% 
  ggplot(mapping = aes(x = xb_04, y = response)) +
  geom_point() +
  theme_bw()

lf_all %>% 
  ggplot(mapping = aes(x = value, y = response)) +
  geom_point() +
  facet_wrap(~name, scales = "free")+
  theme_bw()

log_all %>% 
  ggplot(mapping = aes(x = xa_01, y = logresponse)) +
  geom_point() +
  theme_bw()

lf_all %>% 
  ggplot(mapping = aes(x = value, y = logresponse)) +
  geom_point() +
  facet_wrap(~name, scales = "free")+
 theme_bw()

log_all %>% 
  ggplot(mapping = aes(x = xb_07, y = logresponse)) +
  geom_point() +
  theme_bw()

The trends seem like a linear distribution in the Log_transformed response. It’s hard to tell that the trends depends on the categorical inputs.

###5.How can you visualize the behavior of the binary outcome with respect to the continuous inputs?

lf_all %>% 
  ggplot(mapping = aes(x = value, y = outcome)) +
  geom_point(size = 3.5, alpha = 0.5) +
  facet_wrap(~name, scales = "free")+
  theme_bw()

log_all %>% 
  ggplot(mapping = aes(x = xb_07, y = outcome)) +
  geom_point(size = 3.5, alpha = 0.5) +
  theme_bw()